https://olympus.greatlearning.in/courses/40608/assignments/123646
To predict which customer is more likely to purchase the newly introduced travel package.
# standard imports
# note i had to install xgboost via pip as it is external package
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
#sklearn stuff
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score
import scipy.stats as stats
from sklearn import metrics
from sklearn import tree
from sklearn.model_selection import GridSearchCV
#ensemble apis
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
#To install xgboost library use - !pip install xgboost
from xgboost import XGBClassifier
from sklearn.ensemble import StackingClassifier
# Load the Tourism sheet from the Excel workbook
data_orig= pd.read_excel('Tourism.xlsx',sheet_name='Tourism')
# copying data to another variable to avoid any changes to original data
data=data_orig.copy()
# First five rows for a quick sanity check
data.head()
| CustomerID | ProdTaken | Age | TypeofContact | CityTier | DurationOfPitch | Occupation | Gender | NumberOfPersonVisited | NumberOfFollowups | ProductPitched | PreferredPropertyStar | MaritalStatus | NumberOfTrips | Passport | PitchSatisfactionScore | OwnCar | NumberOfChildrenVisited | Designation | MonthlyIncome | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 200000 | 1 | 41.0 | Self Enquiry | 3 | 6.0 | Salaried | Female | 3 | 3.0 | Deluxe | 3.0 | Single | 1.0 | 1 | 2 | 1 | 0.0 | Manager | 20993.0 |
| 1 | 200001 | 0 | 49.0 | Company Invited | 1 | 14.0 | Salaried | Male | 3 | 4.0 | Deluxe | 4.0 | Divorced | 2.0 | 0 | 3 | 1 | 2.0 | Manager | 20130.0 |
| 2 | 200002 | 1 | 37.0 | Self Enquiry | 1 | 8.0 | Free Lancer | Male | 3 | 4.0 | Basic | 3.0 | Single | 7.0 | 1 | 3 | 0 | 0.0 | Executive | 17090.0 |
| 3 | 200003 | 0 | 33.0 | Company Invited | 1 | 9.0 | Salaried | Female | 2 | 3.0 | Basic | 3.0 | Divorced | 2.0 | 1 | 5 | 1 | 1.0 | Executive | 17909.0 |
| 4 | 200004 | 0 | NaN | Self Enquiry | 1 | 8.0 | Small Business | Male | 2 | 3.0 | Basic | 4.0 | Divorced | 1.0 | 0 | 5 | 1 | 0.0 | Executive | 18468.0 |
# Last five rows — confirms the file loaded completely
data.tail()
| CustomerID | ProdTaken | Age | TypeofContact | CityTier | DurationOfPitch | Occupation | Gender | NumberOfPersonVisited | NumberOfFollowups | ProductPitched | PreferredPropertyStar | MaritalStatus | NumberOfTrips | Passport | PitchSatisfactionScore | OwnCar | NumberOfChildrenVisited | Designation | MonthlyIncome | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 4883 | 204883 | 1 | 49.0 | Self Enquiry | 3 | 9.0 | Small Business | Male | 3 | 5.0 | Deluxe | 4.0 | Unmarried | 2.0 | 1 | 1 | 1 | 1.0 | Manager | 26576.0 |
| 4884 | 204884 | 1 | 28.0 | Company Invited | 1 | 31.0 | Salaried | Male | 4 | 5.0 | Basic | 3.0 | Single | 3.0 | 1 | 3 | 1 | 2.0 | Executive | 21212.0 |
| 4885 | 204885 | 1 | 52.0 | Self Enquiry | 3 | 17.0 | Salaried | Female | 4 | 4.0 | Standard | 4.0 | Married | 7.0 | 0 | 1 | 1 | 3.0 | Senior Manager | 31820.0 |
| 4886 | 204886 | 1 | 19.0 | Self Enquiry | 3 | 16.0 | Small Business | Male | 3 | 4.0 | Basic | 3.0 | Single | 3.0 | 0 | 5 | 0 | 2.0 | Executive | 20289.0 |
| 4887 | 204887 | 1 | 36.0 | Self Enquiry | 1 | 14.0 | Salaried | Male | 4 | 4.0 | Basic | 4.0 | Unmarried | 3.0 | 1 | 3 | 1 | 2.0 | Executive | 24041.0 |
# Dataset dimensions: (rows, columns)
data.shape
(4888, 20)
# Column dtypes and non-null counts — several columns have missing values
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 4888 entries, 0 to 4887 Data columns (total 20 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 CustomerID 4888 non-null int64 1 ProdTaken 4888 non-null int64 2 Age 4662 non-null float64 3 TypeofContact 4863 non-null object 4 CityTier 4888 non-null int64 5 DurationOfPitch 4637 non-null float64 6 Occupation 4888 non-null object 7 Gender 4888 non-null object 8 NumberOfPersonVisited 4888 non-null int64 9 NumberOfFollowups 4843 non-null float64 10 ProductPitched 4888 non-null object 11 PreferredPropertyStar 4862 non-null float64 12 MaritalStatus 4888 non-null object 13 NumberOfTrips 4748 non-null float64 14 Passport 4888 non-null int64 15 PitchSatisfactionScore 4888 non-null int64 16 OwnCar 4888 non-null int64 17 NumberOfChildrenVisited 4822 non-null float64 18 Designation 4888 non-null object 19 MonthlyIncome 4655 non-null float64 dtypes: float64(7), int64(7), object(6) memory usage: 763.9+ KB
# Count the number of variables in each datatype
data.dtypes.value_counts()
int64 7 float64 7 object 6 dtype: int64
# Number of fully duplicated rows (0 expected since CustomerID is unique)
sum(data.duplicated())
0
# NOTE - Profile - takes about 1 min
# One-glance overview of dtypes to change and missing values to impute
# NOTE(review): pandas_profiling has been renamed to ydata-profiling upstream —
# the import below may need updating on newer environments; confirm.
from pandas_profiling import ProfileReport
profile = ProfileReport(data)
profile
# Convert every object-dtype column to pandas Categorical
for name in data.select_dtypes(include="object").columns:
    data[name] = pd.Categorical(data[name])
# Spot-check a few random rows after the conversion
data.sample(n=5)
| CustomerID | ProdTaken | Age | TypeofContact | CityTier | DurationOfPitch | Occupation | Gender | NumberOfPersonVisited | NumberOfFollowups | ProductPitched | PreferredPropertyStar | MaritalStatus | NumberOfTrips | Passport | PitchSatisfactionScore | OwnCar | NumberOfChildrenVisited | Designation | MonthlyIncome | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 438 | 200438 | 0 | 35.0 | Self Enquiry | 1 | 22.0 | Salaried | Male | 3 | 3.0 | Standard | 3.0 | Divorced | 5.0 | 1 | 2 | 0 | 1.0 | Senior Manager | 22632.0 |
| 2356 | 202356 | 0 | NaN | Self Enquiry | 1 | 7.0 | Small Business | Male | 3 | 3.0 | Basic | 4.0 | Married | 1.0 | 1 | 4 | 1 | 2.0 | Executive | 18579.0 |
| 383 | 200383 | 0 | 22.0 | Self Enquiry | 1 | 17.0 | Small Business | Female | 2 | 3.0 | Basic | 4.0 | Divorced | 2.0 | 0 | 2 | 1 | 0.0 | Executive | 17244.0 |
| 4010 | 204010 | 0 | 25.0 | Self Enquiry | 3 | 10.0 | Salaried | Female | 3 | 4.0 | Deluxe | 3.0 | Married | 2.0 | 0 | 4 | 0 | 1.0 | Manager | 23677.0 |
| 2882 | 202882 | 0 | 36.0 | Self Enquiry | 1 | 23.0 | Salaried | Male | 4 | 4.0 | Standard | 3.0 | Divorced | 6.0 | 1 | 2 | 0 | 2.0 | Senior Manager | 26310.0 |
# Summary statistics for the numeric columns (note DurationOfPitch max 127, NumberOfTrips max 22 — outliers)
data.describe()
| CustomerID | ProdTaken | Age | CityTier | DurationOfPitch | NumberOfPersonVisited | NumberOfFollowups | PreferredPropertyStar | NumberOfTrips | Passport | PitchSatisfactionScore | OwnCar | NumberOfChildrenVisited | MonthlyIncome | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 4888.000000 | 4888.000000 | 4662.000000 | 4888.000000 | 4637.000000 | 4888.000000 | 4843.000000 | 4862.000000 | 4748.000000 | 4888.000000 | 4888.000000 | 4888.000000 | 4822.000000 | 4655.000000 |
| mean | 202443.500000 | 0.188216 | 37.622265 | 1.654255 | 15.490835 | 2.905074 | 3.708445 | 3.581037 | 3.236521 | 0.290917 | 3.078151 | 0.620295 | 1.187267 | 23619.853491 |
| std | 1411.188388 | 0.390925 | 9.316387 | 0.916583 | 8.519643 | 0.724891 | 1.002509 | 0.798009 | 1.849019 | 0.454232 | 1.365792 | 0.485363 | 0.857861 | 5380.698361 |
| min | 200000.000000 | 0.000000 | 18.000000 | 1.000000 | 5.000000 | 1.000000 | 1.000000 | 3.000000 | 1.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 1000.000000 |
| 25% | 201221.750000 | 0.000000 | 31.000000 | 1.000000 | 9.000000 | 2.000000 | 3.000000 | 3.000000 | 2.000000 | 0.000000 | 2.000000 | 0.000000 | 1.000000 | 20346.000000 |
| 50% | 202443.500000 | 0.000000 | 36.000000 | 1.000000 | 13.000000 | 3.000000 | 4.000000 | 3.000000 | 3.000000 | 0.000000 | 3.000000 | 1.000000 | 1.000000 | 22347.000000 |
| 75% | 203665.250000 | 0.000000 | 44.000000 | 3.000000 | 20.000000 | 3.000000 | 4.000000 | 4.000000 | 4.000000 | 1.000000 | 4.000000 | 1.000000 | 2.000000 | 25571.000000 |
| max | 204887.000000 | 1.000000 | 61.000000 | 3.000000 | 127.000000 | 5.000000 | 6.000000 | 5.000000 | 22.000000 | 1.000000 | 5.000000 | 1.000000 | 3.000000 | 98678.000000 |
# checking for unique values in ID column
data["CustomerID"].nunique()
4888
# drop it - no added valued
data.drop(["CustomerID"],axis=1,inplace=True)
# Remaining 19 columns after dropping CustomerID
data.columns
Index(['ProdTaken', 'Age', 'TypeofContact', 'CityTier', 'DurationOfPitch',
'Occupation', 'Gender', 'NumberOfPersonVisited', 'NumberOfFollowups',
'ProductPitched', 'PreferredPropertyStar', 'MaritalStatus',
'NumberOfTrips', 'Passport', 'PitchSatisfactionScore', 'OwnCar',
'NumberOfChildrenVisited', 'Designation', 'MonthlyIncome'],
dtype='object')
data["Age"].unique()
array([41., 49., 37., 33., nan, 32., 59., 30., 38., 36., 35., 31., 34.,
28., 29., 22., 53., 21., 42., 44., 46., 39., 24., 43., 50., 27.,
26., 48., 55., 45., 56., 23., 51., 40., 54., 58., 20., 25., 19.,
57., 52., 47., 18., 60., 61.])
data["TypeofContact"].unique()
['Self Enquiry', 'Company Invited', NaN] Categories (2, object): ['Self Enquiry', 'Company Invited']
There are null values in the TypeofContact variable that will need imputation.
data["CityTier"].value_counts()
1 3190 3 1500 2 198 Name: CityTier, dtype: int64
data["DurationOfPitch"].unique()
array([ 6., 14., 8., 9., 30., 29., 33., 22., 21., 32., 25.,
27., 11., 17., 15., 13., 12., 16., 10., 31., 18., nan,
24., 35., 28., 20., 26., 34., 23., 5., 19., 126., 7.,
36., 127.])
There are null values in the DurationOfPitch variable; the values 126 and 127 also stand out as extreme outliers.
data["Occupation"].value_counts()
Salaried 2368 Small Business 2084 Large Business 434 Free Lancer 2 Name: Occupation, dtype: int64
data["Gender"].value_counts()
Male 2916 Female 1817 Fe Male 155 Name: Gender, dtype: int64
Note the data-entry error 'Fe Male' in the Gender column — it should be merged into 'Female'.
# Merge the mis-typed 'Fe Male' label into 'Female'
# NOTE(review): Gender was cast to Categorical above; in-place replace via
# attribute access worked on the pandas version used here, but inplace
# replace is deprecated in newer pandas — confirm on upgrade.
data.Gender.replace('Fe Male','Female',inplace=True)
# Verify the merge: only Male/Female remain
data["Gender"].value_counts()
data["NumberOfPersonVisited"].value_counts()
3 2402 2 1418 4 1026 1 39 5 3 Name: NumberOfPersonVisited, dtype: int64
data["NumberOfFollowups"].value_counts()
4.0 2068 3.0 1466 5.0 768 2.0 229 1.0 176 6.0 136 Name: NumberOfFollowups, dtype: int64
data["ProductPitched"].value_counts()
Basic 1842 Deluxe 1732 Standard 742 Super Deluxe 342 King 230 Name: ProductPitched, dtype: int64
data["PreferredPropertyStar"].value_counts()
3.0 2993 5.0 956 4.0 913 Name: PreferredPropertyStar, dtype: int64
data["MaritalStatus"].value_counts()
Married 2340 Divorced 950 Single 916 Unmarried 682 Name: MaritalStatus, dtype: int64
data["NumberOfTrips"].value_counts()
2.0 1464 3.0 1079 1.0 620 4.0 478 5.0 458 6.0 322 7.0 218 8.0 105 20.0 1 19.0 1 22.0 1 21.0 1 Name: NumberOfTrips, dtype: int64
data["Passport"].value_counts()
0 3466 1 1422 Name: Passport, dtype: int64
data["PitchSatisfactionScore"].value_counts()
3 1478 5 970 1 942 4 912 2 586 Name: PitchSatisfactionScore, dtype: int64
data["OwnCar"].value_counts()
1 3032 0 1856 Name: OwnCar, dtype: int64
data["Designation"].value_counts()
Executive 1842 Manager 1732 Senior Manager 742 AVP 342 VP 230 Name: Designation, dtype: int64
data["MonthlyIncome"].unique()
array([20993., 20130., 17090., ..., 22097., 22995., 21471.])
data["ProdTaken"].value_counts(normalize=True)
0 0.811784 1 0.188216 Name: ProdTaken, dtype: float64
# Missing-value counts per column, ascending
data.isnull().sum().sort_values()
ProdTaken 0 OwnCar 0 PitchSatisfactionScore 0 Passport 0 MaritalStatus 0 Designation 0 NumberOfPersonVisited 0 ProductPitched 0 Occupation 0 CityTier 0 Gender 0 TypeofContact 25 PreferredPropertyStar 26 NumberOfFollowups 45 NumberOfChildrenVisited 66 NumberOfTrips 140 Age 226 MonthlyIncome 233 DurationOfPitch 251 dtype: int64
# Histograms for every numeric column, laid out on a 5x3 grid
cols_numeric=['ProdTaken','CityTier','NumberOfPersonVisited','Passport','PitchSatisfactionScore','OwnCar','Age','DurationOfPitch','NumberOfFollowups','PreferredPropertyStar','NumberOfTrips','NumberOfChildrenVisited','MonthlyIncome']
plt.figure(figsize=(20,80))
for pos, col in enumerate(cols_numeric, start=1):
    plt.subplot(5, 3, pos)
    plt.hist(data[col])
    plt.tight_layout()
    plt.title(col)
plt.show()
# View the distribution of the Age variable (histogram + boxplot)
plt.figure(figsize=(20,20))
plt.subplot(3,1,1)
sns.histplot(x=data.Age,color="green",kde=True);
plt.axvline(data["Age"].mean(),color='m',linewidth=3)# draw a line indicating mean
plt.axvline(data["Age"].median(),color='b',linestyle='dashed',linewidth=2)# draw a line indicating median
plt.axvline(data["Age"].mode()[0],color='w',linestyle='dashed',linewidth=1)# draw a line indicating mode
plt.subplot(3,1,2)
# Boxplot of the same variable to expose outliers
sns.boxplot(x=data.Age)
<AxesSubplot:xlabel='Age'>
# View the distribution of the DurationOfPitch variable (histogram + boxplot)
plt.figure(figsize=(20,20))
plt.subplot(3,1,1)
sns.histplot(x=data.DurationOfPitch,color="red",kde=True);
plt.axvline(data["DurationOfPitch"].mean(),color='m',linewidth=3)# draw a line indicating mean
plt.axvline(data["DurationOfPitch"].median(),color='b',linestyle='dashed',linewidth=2)# draw a line indicating median
plt.axvline(data["DurationOfPitch"].mode()[0],color='w',linestyle='dashed',linewidth=1)# draw a line indicating mode
plt.subplot(3,1,2)
# Boxplot will show the extreme 126/127-minute pitches as outliers
sns.boxplot(x=data.DurationOfPitch);
Observations:
# View the distribution of the MonthlyIncome variable (histogram + boxplot)
plt.figure(figsize=(10,10))
plt.subplot(3,1,1)
sns.histplot(x=data.MonthlyIncome,color="purple",kde=True);
plt.axvline(data["MonthlyIncome"].mean(),color='m',linewidth=3)# draw a line indicating mean
plt.axvline(data["MonthlyIncome"].median(),color='b',linestyle='dashed',linewidth=2)# draw a line indicating median
plt.axvline(data["MonthlyIncome"].mode()[0],color='w',linestyle='dashed',linewidth=1)# draw a line indicating mode
plt.subplot(3,1,2)
# Boxplot to expose the high-income outliers (max 98678 vs median ~22347)
sns.boxplot(x=data.MonthlyIncome);
# Function from lecture notes to create barplots - with % per category
def bar_perc(plot, feature):
    '''
    Annotate each bar of a countplot with its percentage of the total.

    plot    : matplotlib Axes holding the bars (e.g. the return of sns.countplot)
    feature : 1-d categorical feature array the plot was drawn from
    '''
    total = len(feature)  # length of the column
    # BUGFIX: the original iterated the *global* `ax` instead of the `plot`
    # parameter; it only worked because callers happened to assign the axes
    # to a global named `ax`. Use the parameter so the function is self-contained.
    for p in plot.patches:
        percentage = '{:.1f}%'.format(100 * p.get_height()/total)  # percentage of each class of the category
        x = p.get_x() + p.get_width() / 2 - 0.05  # horizontal centre of the bar (nudged left)
        y = p.get_y() + p.get_height()  # top of the bar
        plot.annotate(percentage, (x, y), size = 12)  # annotate the percentage
# Countplots with percentage labels for the key categorical/discrete columns.
# The nine original copy-pasted cells are folded into one loop; the plots are
# produced in exactly the same order as before.
count_cols = ["CityTier", "NumberOfPersonVisited", "PitchSatisfactionScore",
              "OwnCar", "NumberOfFollowups", "PreferredPropertyStar",
              "NumberOfTrips", "NumberOfChildrenVisited", "Gender"]
for col in count_cols:
    plt.figure(figsize=(7,5))
    # keep the axes in the (module-level) name `ax`, matching the original cells
    ax = sns.countplot(x=data[col],palette='winter');
    bar_perc(ax, data[col])
# Pairwise correlations of the numeric columns
# NOTE(review): data.corr() silently ignores the categorical columns on the
# pandas version used here; newer pandas requires numeric_only=True — confirm.
corr=data.corr()
plt.figure(figsize=(15,15))
sns.heatmap(corr, annot=True,cmap='coolwarm',
            fmt=".2f",
            xticklabels=corr.columns,
            yticklabels=corr.columns);
Observations:
# Pairwise scatter plots of all numeric columns (lower triangle only)
sns.pairplot(data, corner=True);
## Utility to plot stacked bar chart - src Course Notes
def stacked_plot_util(x):
    """Print a crosstab of *x* against ProductPitched, then draw a
    row-normalized stacked bar chart of the same table."""
    sns.set(palette='nipy_spectral')
    # Raw counts, including row/column totals
    counts = pd.crosstab(x, data['ProductPitched'], margins=True)
    print(counts)
    print('-'*120)
    # Row-wise shares for the stacked bars
    shares = pd.crosstab(x, data['ProductPitched'], normalize='index')
    shares.plot(kind='bar', stacked=True, figsize=(10,5))
    plt.show()
# Which product gets pitched at which age — younger customers get Basic, older get King/Super Deluxe
stacked_plot_util(data['Age'])
ProductPitched Basic Deluxe King Standard Super Deluxe All Age 18.0 14 0 0 0 0 14 19.0 31 0 0 1 0 32 20.0 37 0 0 1 0 38 21.0 40 1 0 0 0 41 22.0 43 3 0 0 0 46 23.0 41 5 0 0 0 46 24.0 46 10 0 0 0 56 25.0 43 31 0 0 0 74 26.0 61 44 0 1 0 106 27.0 83 45 1 9 0 138 28.0 91 43 1 12 0 147 29.0 109 52 0 16 1 178 30.0 105 67 0 26 1 199 31.0 83 87 0 32 1 203 32.0 72 97 0 27 1 197 33.0 75 88 0 22 4 189 34.0 83 96 0 27 5 211 35.0 88 98 0 50 1 237 36.0 78 92 0 61 0 231 37.0 58 81 0 46 0 185 38.0 54 81 0 41 0 176 39.0 33 70 5 33 9 150 40.0 25 61 14 29 17 146 41.0 35 59 19 26 16 155 42.0 36 54 18 16 18 142 43.0 32 51 11 15 21 130 44.0 35 33 8 15 14 105 45.0 34 40 9 22 11 116 46.0 13 44 12 33 19 121 47.0 4 32 11 23 18 88 48.0 10 24 5 13 13 65 49.0 8 20 9 15 13 65 50.0 10 19 22 13 22 86 51.0 17 19 18 15 21 90 52.0 13 17 9 14 15 68 53.0 7 14 10 12 23 66 54.0 0 17 6 17 21 61 55.0 3 17 12 14 18 64 56.0 13 9 13 7 16 58 57.0 9 5 3 5 7 29 58.0 7 6 3 8 7 31 59.0 13 7 5 12 7 44 60.0 6 8 4 9 2 29 61.0 0 4 2 3 0 9 All 1698 1651 230 741 342 4662 ------------------------------------------------------------------------------------------------------------------------
# looking at which columns have the most missing values (before imputation)
data.isnull().sum().sort_values()
ProdTaken 0 OwnCar 0 PitchSatisfactionScore 0 Passport 0 MaritalStatus 0 Designation 0 NumberOfPersonVisited 0 ProductPitched 0 Occupation 0 CityTier 0 Gender 0 TypeofContact 25 PreferredPropertyStar 26 NumberOfFollowups 45 NumberOfChildrenVisited 66 NumberOfTrips 140 Age 226 MonthlyIncome 233 DurationOfPitch 251 dtype: int64
# Impute missing values in the numeric columns with each column's median
# (median is robust to the outliers seen in DurationOfPitch / NumberOfTrips).
# The original wrapped each fillna in Series.transform(lambda ...), which adds
# nothing over calling fillna directly; the fill values are identical.
for col in ["Age", "DurationOfPitch", "NumberOfTrips",
            "NumberOfChildrenVisited", "NumberOfFollowups",
            "PreferredPropertyStar"]:
    data[col] = data[col].fillna(data[col].median())
# TypeofContact is categorical, so impute with the most frequent value (mode)
data["TypeofContact"] = data['TypeofContact'].fillna(data.TypeofContact.mode()[0])
# Re-check missing values — only MonthlyIncome should remain (left for KNN imputation)
data.isnull().sum().sort_values()
ProdTaken 0 NumberOfChildrenVisited 0 OwnCar 0 PitchSatisfactionScore 0 Passport 0 NumberOfTrips 0 MaritalStatus 0 PreferredPropertyStar 0 Designation 0 ProductPitched 0 NumberOfPersonVisited 0 Gender 0 Occupation 0 DurationOfPitch 0 CityTier 0 TypeofContact 0 Age 0 NumberOfFollowups 0 MonthlyIncome 233 dtype: int64
# One-hot encode the categorical variables; drop_first=True avoids perfectly collinear dummies
data=pd.get_dummies(data, columns=["TypeofContact", "Occupation","Gender","ProductPitched","MaritalStatus","Designation"],prefix=["TypeofContact", "Occupation","Gender","ProductPitched","MaritalStatus","Designation"], drop_first=True)
# Hold the target aside so it does not influence the KNN imputation
data_cat=data['ProdTaken']
# Dropping the columns which are not required while imputation.
data.drop(['ProdTaken'], axis=1,inplace=True)
# External package used pip to install it
# https://stackoverflow.com/questions/44239269/fancyimpute-installation-in-anaconda
from fancyimpute import KNN
# calling the KNN class — presumably the default neighbour count; TODO confirm k
knn_imputer = KNN()
# imputing the remaining missing values (MonthlyIncome) with the knn imputer
imputed_data = knn_imputer.fit_transform(data)
# fit_transform returns a bare ndarray, so rebuild the DataFrame with the original column names
data = pd.DataFrame(imputed_data,columns=data.columns)
Imputing row 1/4888 with 0 missing, elapsed time: 3.670 Imputing row 101/4888 with 0 missing, elapsed time: 3.671 Imputing row 201/4888 with 1 missing, elapsed time: 3.671 Imputing row 301/4888 with 0 missing, elapsed time: 3.671 Imputing row 401/4888 with 0 missing, elapsed time: 3.672 Imputing row 501/4888 with 0 missing, elapsed time: 3.672 Imputing row 601/4888 with 0 missing, elapsed time: 3.673 Imputing row 701/4888 with 0 missing, elapsed time: 3.673 Imputing row 801/4888 with 0 missing, elapsed time: 3.674 Imputing row 901/4888 with 0 missing, elapsed time: 3.674 Imputing row 1001/4888 with 0 missing, elapsed time: 3.675 Imputing row 1101/4888 with 0 missing, elapsed time: 3.675 Imputing row 1201/4888 with 0 missing, elapsed time: 3.676 Imputing row 1301/4888 with 0 missing, elapsed time: 3.676 Imputing row 1401/4888 with 0 missing, elapsed time: 3.676 Imputing row 1501/4888 with 0 missing, elapsed time: 3.677 Imputing row 1601/4888 with 0 missing, elapsed time: 3.677 Imputing row 1701/4888 with 0 missing, elapsed time: 3.678 Imputing row 1801/4888 with 0 missing, elapsed time: 3.678 Imputing row 1901/4888 with 1 missing, elapsed time: 3.678 Imputing row 2001/4888 with 0 missing, elapsed time: 3.679 Imputing row 2101/4888 with 0 missing, elapsed time: 3.679 Imputing row 2201/4888 with 0 missing, elapsed time: 3.680 Imputing row 2301/4888 with 0 missing, elapsed time: 3.680 Imputing row 2401/4888 with 0 missing, elapsed time: 3.681 Imputing row 2501/4888 with 0 missing, elapsed time: 3.681 Imputing row 2601/4888 with 0 missing, elapsed time: 3.681 Imputing row 2701/4888 with 0 missing, elapsed time: 3.681 Imputing row 2801/4888 with 0 missing, elapsed time: 3.681 Imputing row 2901/4888 with 0 missing, elapsed time: 3.682 Imputing row 3001/4888 with 0 missing, elapsed time: 3.682 Imputing row 3101/4888 with 0 missing, elapsed time: 3.682 Imputing row 3201/4888 with 0 missing, elapsed time: 3.682 Imputing row 3301/4888 with 0 missing, elapsed time: 3.682 
Imputing row 3401/4888 with 0 missing, elapsed time: 3.682 Imputing row 3501/4888 with 0 missing, elapsed time: 3.683 Imputing row 3601/4888 with 0 missing, elapsed time: 3.683 Imputing row 3701/4888 with 0 missing, elapsed time: 3.683 Imputing row 3801/4888 with 0 missing, elapsed time: 3.683 Imputing row 3901/4888 with 0 missing, elapsed time: 3.683 Imputing row 4001/4888 with 0 missing, elapsed time: 3.683 Imputing row 4101/4888 with 0 missing, elapsed time: 3.684 Imputing row 4201/4888 with 0 missing, elapsed time: 3.684 Imputing row 4301/4888 with 0 missing, elapsed time: 3.684 Imputing row 4401/4888 with 0 missing, elapsed time: 3.684 Imputing row 4501/4888 with 0 missing, elapsed time: 3.684 Imputing row 4601/4888 with 0 missing, elapsed time: 3.684 Imputing row 4701/4888 with 0 missing, elapsed time: 3.685 Imputing row 4801/4888 with 0 missing, elapsed time: 3.685
# Re-attach the target column (separated before imputation) to the imputed features
data=pd.concat([data,data_cat],axis=1)
# Verify the full column set, with ProdTaken back as the last column
data.columns
Index(['Age', 'CityTier', 'DurationOfPitch', 'NumberOfPersonVisited',
'NumberOfFollowups', 'PreferredPropertyStar', 'NumberOfTrips',
'Passport', 'PitchSatisfactionScore', 'OwnCar',
'NumberOfChildrenVisited', 'MonthlyIncome',
'TypeofContact_Self Enquiry', 'Occupation_Large Business',
'Occupation_Salaried', 'Occupation_Small Business', 'Gender_Male',
'ProductPitched_Deluxe', 'ProductPitched_King',
'ProductPitched_Standard', 'ProductPitched_Super Deluxe',
'MaritalStatus_Married', 'MaritalStatus_Single',
'MaritalStatus_Unmarried', 'Designation_Executive',
'Designation_Manager', 'Designation_Senior Manager', 'Designation_VP',
'ProdTaken'],
dtype='object')
# First rows after encoding + imputation — all columns are now numeric
data.head()
| Age | CityTier | DurationOfPitch | NumberOfPersonVisited | NumberOfFollowups | PreferredPropertyStar | NumberOfTrips | Passport | PitchSatisfactionScore | OwnCar | ... | ProductPitched_Standard | ProductPitched_Super Deluxe | MaritalStatus_Married | MaritalStatus_Single | MaritalStatus_Unmarried | Designation_Executive | Designation_Manager | Designation_Senior Manager | Designation_VP | ProdTaken | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 41.0 | 3.0 | 6.0 | 3.0 | 3.0 | 3.0 | 1.0 | 1.0 | 2.0 | 1.0 | ... | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1 |
| 1 | 49.0 | 1.0 | 14.0 | 3.0 | 4.0 | 4.0 | 2.0 | 0.0 | 3.0 | 1.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0 |
| 2 | 37.0 | 1.0 | 8.0 | 3.0 | 4.0 | 3.0 | 7.0 | 1.0 | 3.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1 |
| 3 | 33.0 | 1.0 | 9.0 | 2.0 | 3.0 | 3.0 | 2.0 | 1.0 | 5.0 | 1.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0 |
| 4 | 36.0 | 1.0 | 8.0 | 2.0 | 3.0 | 4.0 | 1.0 | 0.0 | 5.0 | 1.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0 |
5 rows × 29 columns
# Final missing-value check — every column should be at zero now
data.isnull().sum().sort_values(ascending=False)
Age 0 Occupation_Small Business 0 Designation_VP 0 Designation_Senior Manager 0 Designation_Manager 0 Designation_Executive 0 MaritalStatus_Unmarried 0 MaritalStatus_Single 0 MaritalStatus_Married 0 ProductPitched_Super Deluxe 0 ProductPitched_Standard 0 ProductPitched_King 0 ProductPitched_Deluxe 0 Gender_Male 0 Occupation_Salaried 0 CityTier 0 Occupation_Large Business 0 TypeofContact_Self Enquiry 0 MonthlyIncome 0 NumberOfChildrenVisited 0 OwnCar 0 PitchSatisfactionScore 0 Passport 0 NumberOfTrips 0 PreferredPropertyStar 0 NumberOfFollowups 0 NumberOfPersonVisited 0 DurationOfPitch 0 ProdTaken 0 dtype: int64
All missing values have now been imputed — zero nulls remain in every column.
# Continuous columns that showed outliers in the EDA above
numeric_columns=['Age','DurationOfPitch','MonthlyIncome','NumberOfTrips']
# Box plots before outlier treatment, for comparison with the capped versions below
plt.figure(figsize=(10,10))
for i, variable in enumerate(numeric_columns):
    plt.subplot(3,3,i+1)
    plt.boxplot(data[variable],whis=1.5)
    plt.tight_layout()
    plt.title(variable)
plt.show()
# Utility: cap outliers at whiskers placed 3.5*IQR beyond the quartiles
def _outliers(df,col):
    """Clip df[col] in place to [Q1 - 3.5*IQR, Q3 + 3.5*IQR] and return df."""
    q1 = df[col].quantile(0.25)   # 25th percentile
    q3 = df[col].quantile(0.75)   # 75th percentile
    spread = q3 - q1              # inter-quartile range
    margin = 3.5 * spread         # deliberately wider than the usual 1.5*IQR
    low, high = q1 - margin, q3 + margin
    # np.clip pins values below `low` to low and above `high` to high
    df[col] = np.clip(df[col], low, high)
    return df

def utility_process_outliers(df, col_list):
    """Apply the 3.5*IQR cap to every column in col_list; mutates and returns df."""
    for name in col_list:
        df = _outliers(df, name)
    return df
# Cap the outliers in the four skewed continuous columns at 3.5*IQR
data = utility_process_outliers(data,numeric_columns)
# Re-draw the box plots (whiskers at 3.5, matching the capping rule) to confirm
# the outliers have been clipped
plt.figure(figsize=(20,20))
for i, variable in enumerate(numeric_columns):
    plt.subplot(3,3,i+1)
    plt.boxplot(data[variable],whis=3.5)
    plt.tight_layout()
    plt.title(variable)
plt.show()
The box plots confirm the outliers have been capped at the 3.5*IQR whiskers.
# Final pre-modelling view of the prepared dataset
data
| Age | CityTier | DurationOfPitch | NumberOfPersonVisited | NumberOfFollowups | PreferredPropertyStar | NumberOfTrips | Passport | PitchSatisfactionScore | OwnCar | ... | ProductPitched_Standard | ProductPitched_Super Deluxe | MaritalStatus_Married | MaritalStatus_Single | MaritalStatus_Unmarried | Designation_Executive | Designation_Manager | Designation_Senior Manager | Designation_VP | ProdTaken | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 41.0 | 3.0 | 6.0 | 3.0 | 3.0 | 3.0 | 1.0 | 1.0 | 2.0 | 1.0 | ... | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1 |
| 1 | 49.0 | 1.0 | 14.0 | 3.0 | 4.0 | 4.0 | 2.0 | 0.0 | 3.0 | 1.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0 |
| 2 | 37.0 | 1.0 | 8.0 | 3.0 | 4.0 | 3.0 | 7.0 | 1.0 | 3.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1 |
| 3 | 33.0 | 1.0 | 9.0 | 2.0 | 3.0 | 3.0 | 2.0 | 1.0 | 5.0 | 1.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0 |
| 4 | 36.0 | 1.0 | 8.0 | 2.0 | 3.0 | 4.0 | 1.0 | 0.0 | 5.0 | 1.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 4883 | 49.0 | 3.0 | 9.0 | 3.0 | 5.0 | 4.0 | 2.0 | 1.0 | 1.0 | 1.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1 |
| 4884 | 28.0 | 1.0 | 31.0 | 4.0 | 5.0 | 3.0 | 3.0 | 1.0 | 3.0 | 1.0 | ... | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1 |
| 4885 | 52.0 | 3.0 | 17.0 | 4.0 | 4.0 | 4.0 | 7.0 | 0.0 | 1.0 | 1.0 | ... | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1 |
| 4886 | 19.0 | 3.0 | 16.0 | 3.0 | 4.0 | 3.0 | 3.0 | 0.0 | 5.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1 |
| 4887 | 36.0 | 1.0 | 14.0 | 4.0 | 4.0 | 4.0 | 3.0 | 1.0 | 3.0 | 1.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1 |
4888 rows × 29 columns
X = data.drop("ProdTaken", axis=1)
y = data.pop("ProdTaken")
#Convert categorical variables to dummy variables
X = pd.get_dummies(X, drop_first=True)
X_train, X_test, y_train, y_test = train_test_split(X,
y, test_size=0.30,stratify=y,random_state=42)
## Function to create confusion matrix
def make_confusion_matrix(model,y_actual,labels=[1, 0]):
    '''
    Plot a labelled confusion-matrix heatmap for `model` on the global X_test.

    model : classifier used to predict values of X_test
    y_actual : ground truth for X_test
    labels : NOTE(review) - this parameter is never used; the confusion matrix
             below hardcodes labels=[0, 1] and the local name is reassigned.
             (It is also a mutable default argument.) Kept for interface
             compatibility.
    '''
    # Predicts on the module-level X_test, not on an argument
    y_predict = model.predict(X_test)
    cm=metrics.confusion_matrix( y_actual, y_predict, labels=[0, 1])
    df_cm = pd.DataFrame(cm, index = [i for i in ["Actual - No","Actual - Yes"]],
                  columns = [i for i in ['Predicted - No','Predicted - Yes']])
    # Cell text: absolute count plus percentage of all samples
    group_counts = ["{0:0.0f}".format(value) for value in
                    cm.flatten()]
    group_percentages = ["{0:.2%}".format(value) for value in
                         cm.flatten()/np.sum(cm)]
    labels = [f"{v1}\n{v2}" for v1, v2 in
              zip(group_counts,group_percentages)]
    labels = np.asarray(labels).reshape(2,2)
    plt.figure(figsize = (10,7))
    sns.heatmap(df_cm, annot=labels,fmt='')
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
## Function to calculate different metric scores of the model - Accuracy, Recall, Precision and F1
def get_metrics_score(model,flag=True):
    '''
    Score `model` on the global train/test split (X_train/y_train, X_test/y_test).

    model : fitted classifier
    flag  : when True (default), also print every score

    Returns a list in the order:
    [train_acc, test_acc, train_recall, test_recall,
     train_precision, test_precision, train_f1, test_f1]
    '''
    # defining an empty list to store train and test results
    score_list=[]
    # Predicting on train and test once; all metrics reuse these predictions
    pred_train = model.predict(X_train)
    pred_test = model.predict(X_test)
    # Accuracy of the model
    train_acc = model.score(X_train,y_train)
    test_acc = model.score(X_test,y_test)
    # Recall of the model
    train_recall = metrics.recall_score(y_train,pred_train)
    test_recall = metrics.recall_score(y_test,pred_test)
    # Precision of the model
    train_precision = metrics.precision_score(y_train,pred_train)
    test_precision = metrics.precision_score(y_test,pred_test)
    # f1-Score of the model
    train_f1_score = metrics.f1_score(y_train,pred_train)
    test_f1_score = metrics.f1_score(y_test,pred_test)
    score_list.extend((train_acc,test_acc,train_recall,test_recall,train_precision,test_precision,train_f1_score,test_f1_score))
    # Print the stored values instead of recomputing every metric a second
    # time as the original did — identical output, half the work.
    if flag == True:
        print("Accuracy on training set : ",train_acc)
        print("Accuracy on test set : ",test_acc)
        print("Recall on training set : ",train_recall)
        print("Recall on test set : ",test_recall)
        print("Precision on training set : ",train_precision)
        print("Precision on test set : ",test_precision)
        print("F1 Score on training set : ",train_f1_score)
        print("F1 Score on test set : ",test_f1_score)
    return score_list # returning the list with train and test scores
# Baseline decision tree; class_weight ~{0.18, 0.82} offsets the 81/19 class imbalance
DTModel = DecisionTreeClassifier(criterion='gini',class_weight={0:0.18,1:0.82},random_state=1)
DTModel.fit(X_train, y_train)
DecisionTreeClassifier(class_weight={0: 0.18, 1: 0.82}, random_state=1)
#Using above defined function to get accuracy, recall and precision on train and test set
Decision_Tree_Score=get_metrics_score(DTModel)
Accuracy on training set : 1.0 Accuracy on test set : 0.885480572597137 Recall on training set : 1.0 Recall on test set : 0.6340579710144928 Precision on training set : 1.0 Precision on test set : 0.7231404958677686 F1 Score on training set : 1.0 F1 Score on test set : 0.6756756756756757
make_confusion_matrix(DTModel,y_test)
# Choose the type of classifier.
dtree_tuned = DecisionTreeClassifier(random_state=1,class_weight = {0:.18,1:.82})
# Grid of parameters to choose from
parameters = {
'max_depth': np.arange(10,15),
'criterion': ['entropy','gini'],
'splitter': ['best','random'],
#'min_impurity_decrease': [0.000001,0.00001,0.0001],
# 'max_features': ['log2','sqrt']
'max_features': ['sqrt'],
'min_samples_leaf': [1, 2, 5, 7, 10,15,20],
'min_samples_split':[70,80,90,100],
#'max_leaf_nodes' : [5, 10,15,20,25,30],
#'min_impurity_decrease': [0.0001,0.001,0.01,0.1]
}
# Type of scoring used to compare parameter combinations
# Recall is the target metric: we care most about catching likely buyers.
scorer = metrics.make_scorer(metrics.recall_score)
# Run the grid search (5-fold CV over every parameter combination)
grid_obj = GridSearchCV(dtree_tuned, parameters, scoring=scorer,cv=5)
grid_obj = grid_obj.fit(X_train, y_train)
# Set the clf to the best combination of parameters
dtree_tuned = grid_obj.best_estimator_
# Fit the best algorithm to the data.
dtree_tuned.fit(X_train, y_train)
DecisionTreeClassifier(class_weight={0: 0.18, 1: 0.82}, max_depth=12,
max_features='sqrt', min_samples_leaf=10,
min_samples_split=80, random_state=1)
#Using above defined function to get accuracy, recall and precision on train and test set
Decision_Tree_Tuned_Score=get_metrics_score(dtree_tuned)
Accuracy on training set : 0.7570885705933937 Accuracy on test set : 0.7191547375596455 Recall on training set : 0.8090062111801242 Recall on test set : 0.6847826086956522 Precision on training set : 0.4239218877135883 Precision on test set : 0.36770428015564205 F1 Score on training set : 0.5563267485317672 F1 Score on test set : 0.4784810126582279
make_confusion_matrix(dtree_tuned,y_test)
# View the Decision tree
column_names = list(X.columns)
feature_names = column_names
plt.figure(figsize=(20,30))
out = tree.plot_tree(dtree_tuned,feature_names=feature_names,filled=True,fontsize=9,node_ids=False,class_names=None,)
#below code will add arrows to the decision tree split if they are missing
for o in out:
    arrow = o.arrow_patch
    if arrow is not None:
        arrow.set_edgecolor('black')
        arrow.set_linewidth(1)
plt.show()
# Find gini importance for features (normalized total reduction of the split
# criterion contributed by each feature), sorted most important first.
print (pd.DataFrame(dtree_tuned.feature_importances_, columns = ["Imp"], index = X_train.columns).sort_values(by = 'Imp', ascending = False))
Imp Passport 0.256258 Designation_Executive 0.136301 PreferredPropertyStar 0.093465 DurationOfPitch 0.074378 CityTier 0.070469 Age 0.043917 MonthlyIncome 0.041987 Designation_Manager 0.038681 Occupation_Large Business 0.030324 MaritalStatus_Single 0.027529 PitchSatisfactionScore 0.025922 ProductPitched_Standard 0.023181 NumberOfPersonVisited 0.022349 NumberOfTrips 0.022067 NumberOfFollowups 0.019855 Occupation_Salaried 0.015335 OwnCar 0.013689 ProductPitched_Super Deluxe 0.012819 Occupation_Small Business 0.009842 MaritalStatus_Married 0.009112 Gender_Male 0.008727 TypeofContact_Self Enquiry 0.003344 NumberOfChildrenVisited 0.000448 ProductPitched_Deluxe 0.000000 ProductPitched_King 0.000000 MaritalStatus_Unmarried 0.000000 Designation_Senior Manager 0.000000 Designation_VP 0.000000
# Tuned decision tree: horizontal bar chart of feature importances,
# sorted ascending so the most important feature is drawn at the top.
importances = dtree_tuned.feature_importances_
feature_names = X_train.columns
indices = importances.argsort()
ys = range(len(indices))
plt.figure(figsize=(12, 12))
plt.title('Feature Importances')
plt.barh(ys, importances[indices], align='center', color='violet')
plt.yticks(ys, [feature_names[i] for i in indices])
plt.xlabel('Relative Importance')
plt.show()
Top features are: Passport, Designation_Executive, PreferredPropertyStar, DurationOfPitch, CityTier
#base_estimator for bagging classifier is a decision tree by default
bagging_estimator=BaggingClassifier(random_state=1)
bagging_estimator.fit(X_train,y_train)
BaggingClassifier(random_state=1)
#Using above defined function to get accuracy, recall and precision on train and test set
bagging_estimator_score=get_metrics_score(bagging_estimator)
Accuracy on training set : 0.9932768196433791 Accuracy on test set : 0.9188820722563054 Recall on training set : 0.9658385093167702 Recall on test set : 0.677536231884058 Precision on training set : 0.9983948635634029 Precision on test set : 0.8617511520737328 F1 Score on training set : 0.9818468823993686 F1 Score on test set : 0.7586206896551725
make_confusion_matrix(bagging_estimator,y_test)
We see that BaggingClassifier is overfitting the training data
#Train the random forest classifier (default hyperparameters)
rf_estimator=RandomForestClassifier(random_state=1)
rf_estimator.fit(X_train,y_train)
RandomForestClassifier(random_state=1)
#Using above defined function to get accuracy, recall and precision on train and test set
rf_estimator_score=get_metrics_score(rf_estimator)
Accuracy on training set : 1.0 Accuracy on test set : 0.9011588275391956 Recall on training set : 1.0 Recall on test set : 0.5507246376811594 Precision on training set : 1.0 Precision on test set : 0.8786127167630058 F1 Score on training set : 1.0 F1 Score on test set : 0.6770601336302895
make_confusion_matrix(rf_estimator,y_test)
# Choose the type of classifier.
bagging_estimator_tuned = BaggingClassifier(random_state=1)
# Grid of parameters to choose from
## add from article
parameters = {'max_samples': [0.7,0.8,0.9,1],
'max_features': [0.7,0.8,0.9,1],
'n_estimators' : [10,20,30,40,50],
}
# Type of scoring used to compare parameter combinations (optimizing recall)
acc_scorer = metrics.make_scorer(metrics.recall_score)
# Run the grid search
grid_obj = GridSearchCV(bagging_estimator_tuned, parameters, scoring=acc_scorer,cv=5)
grid_obj = grid_obj.fit(X_train, y_train)
# Set the clf to the best combination of parameters
bagging_estimator_tuned = grid_obj.best_estimator_
# Fit the best algorithm to the data.
bagging_estimator_tuned.fit(X_train, y_train)
BaggingClassifier(max_features=0.9, max_samples=0.9, n_estimators=40,
random_state=1)
#Using above defined function to get accuracy, recall and precision on train and test set
bagging_estimator_tuned_score=get_metrics_score(bagging_estimator_tuned)
Accuracy on training set : 0.9985384390529085 Accuracy on test set : 0.9100204498977505 Recall on training set : 0.9922360248447205 Recall on test set : 0.6086956521739131 Precision on training set : 1.0 Precision on test set : 0.875 F1 Score on training set : 0.9961028838659391 F1 Score on test set : 0.717948717948718
make_confusion_matrix(bagging_estimator_tuned,y_test)
# Try bagging with logistic regression as the base estimator to see if it does better
bagging_lr=BaggingClassifier(base_estimator=LogisticRegression(random_state=1,max_iter=1000),random_state=1)
bagging_lr.fit(X_train,y_train)
BaggingClassifier(base_estimator=LogisticRegression(max_iter=1000,
random_state=1),
random_state=1)
#Using above defined function to get accuracy, recall and precision on train and test set
bagging_lr_score=get_metrics_score(bagging_lr)
Accuracy on training set : 0.8427360420929553 Accuracy on test set : 0.8289025221540559 Recall on training set : 0.2795031055900621 Recall on test set : 0.26811594202898553 Precision on training set : 0.7086614173228346 Precision on test set : 0.6016260162601627 F1 Score on training set : 0.4008908685968819 F1 Score on test set : 0.3709273182957394
make_confusion_matrix(bagging_lr,y_test)
# Choose the type of classifier.
rf_estimator_tuned = RandomForestClassifier(random_state=1)
# Grid of parameters to choose from
## add from article
parameters = {"n_estimators": [150,200,250],
"min_samples_leaf": [5,6,7,8,9],
"max_features": [0.2,0.3,0.4,0.5,0.6],
"max_samples": [0.3,0.4,0.5,0.6]
}
# Type of scoring used to compare parameter combinations (optimizing recall)
acc_scorer = metrics.make_scorer(metrics.recall_score)
# Run the grid search
# NOTE(review): cv is not passed here (the other searches use cv=5), so this
# one runs with GridSearchCV's library-default fold count — confirm intended.
grid_obj = GridSearchCV(rf_estimator_tuned, parameters, scoring=acc_scorer)
grid_obj = grid_obj.fit(X_train, y_train)
# Set the clf to the best combination of parameters
rf_estimator_tuned = grid_obj.best_estimator_
# Fit the best algorithm to the data.
rf_estimator_tuned.fit(X_train, y_train)
RandomForestClassifier(max_features=0.6, max_samples=0.6, min_samples_leaf=5,
n_estimators=250, random_state=1)
#Using above defined function to get accuracy, recall and precision on train and test set
rf_estimator_tuned_score=get_metrics_score(rf_estimator_tuned)
Accuracy on training set : 0.9187372113417129 Accuracy on test set : 0.8629856850715747 Recall on training set : 0.6009316770186336 Recall on test set : 0.40217391304347827 Precision on training set : 0.9485294117647058 Precision on test set : 0.7551020408163265 F1 Score on training set : 0.7357414448669202 F1 Score on test set : 0.524822695035461
make_confusion_matrix(rf_estimator_tuned,y_test)
# Choose the type of classifier.
rf_estimator_weighted = RandomForestClassifier(random_state=1)
# Grid of parameters to choose from
## add from article
# class_weight upweights the positive class to counter the class imbalance
parameters = {
"class_weight": [{0: 0.18, 1: 0.82}],
"n_estimators": [100,150,200,250],
"min_samples_leaf": [5,6,7,8,9],
"max_features": [0.2,0.3,0.4,0.5,0.6,0.7,0.8],
"max_samples": [0.3,0.4,0.5,0.6],
}
# Type of scoring used to compare parameter combinations (optimizing recall)
acc_scorer = metrics.make_scorer(metrics.recall_score)
# Run the grid search
grid_obj = GridSearchCV(rf_estimator_weighted, parameters, scoring=acc_scorer,cv=5)
grid_obj = grid_obj.fit(X_train, y_train)
# Set the clf to the best combination of parameters
rf_estimator_weighted = grid_obj.best_estimator_
# Fit the best algorithm to the data.
rf_estimator_weighted.fit(X_train, y_train)
RandomForestClassifier(class_weight={0: 0.18, 1: 0.82}, max_features=0.4,
max_samples=0.6, min_samples_leaf=9, random_state=1)
#Using above defined function to get accuracy, recall and precision on train and test set
rf_estimator_weighted_score=get_metrics_score(rf_estimator_weighted)
Accuracy on training set : 0.8897983045893014 Accuracy on test set : 0.8241308793456033 Recall on training set : 0.8369565217391305 Recall on test set : 0.605072463768116 Precision on training set : 0.6646115906288532 Precision on test set : 0.5284810126582279 F1 Score on training set : 0.740893470790378 F1 Score on test set : 0.5641891891891893
make_confusion_matrix(rf_estimator_weighted,y_test)
# importance of features in the tree building ( The importance of a feature is computed as the
#(normalized) total reduction of the criterion brought by that feature. It is also known as the Gini importance )
print(pd.DataFrame(rf_estimator_weighted.feature_importances_, columns = ["Imp"], index = X_train.columns).sort_values(by = 'Imp', ascending = False))
Imp Passport 0.160249 Age 0.110462 DurationOfPitch 0.097228 Designation_Executive 0.092186 MonthlyIncome 0.089233 CityTier 0.056613 PreferredPropertyStar 0.056591 NumberOfTrips 0.043482 PitchSatisfactionScore 0.043194 NumberOfFollowups 0.038833 MaritalStatus_Single 0.031259 MaritalStatus_Married 0.023780 Gender_Male 0.018654 ProductPitched_Deluxe 0.013379 Designation_Manager 0.012751 Occupation_Salaried 0.012736 Occupation_Large Business 0.012363 TypeofContact_Self Enquiry 0.011614 MaritalStatus_Unmarried 0.011467 NumberOfPersonVisited 0.011426 OwnCar 0.011101 Occupation_Small Business 0.011072 NumberOfChildrenVisited 0.010680 Designation_Senior Manager 0.007695 ProductPitched_Standard 0.006364 ProductPitched_Super Deluxe 0.003967 ProductPitched_King 0.000906 Designation_VP 0.000715
# Weighted random forest: feature-importance bar chart, least to most
# important from bottom to top.
importances = rf_estimator_weighted.feature_importances_
indices = importances.argsort()          # ascending order of importance
feature_names = list(X.columns)
ys = range(len(indices))
plt.figure(figsize=(12, 12))
plt.title('Feature Importances')
plt.barh(ys, importances[indices], align='center', color='violet')
plt.yticks(ys, [feature_names[i] for i in indices])
plt.xlabel('Relative Importance')
plt.show()
Top 3 features: Passport, Age and Duration of Pitch
# defining list of models to compare
models = [DTModel, dtree_tuned, bagging_estimator, bagging_estimator_tuned, bagging_lr,
          rf_estimator, rf_estimator_tuned, rf_estimator_weighted]
# get_metrics_score returns
# [train_acc, test_acc, train_recall, test_recall,
#  train_precision, test_precision, train_f1, test_f1];
# score every model once, round to 2 decimals, then split the columns out by
# position (replaces eight hand-maintained append loops).
scores = [np.round(get_metrics_score(model, False), 2) for model in models]
(acc_train, acc_test, recall_train, recall_test,
 precision_train, precision_test,
 f1_score_train, f1_score_test) = (list(col) for col in zip(*scores))
# Summary table of train/test metrics for every model fitted so far
# (label typo 'deafult' fixed).
comparison_frame = pd.DataFrame({
    'Model': ['Decision Tree Model', 'Tuned Decision Tree',
              'Bagging classifier with default parameters', 'Tuned Bagging Classifier',
              'Bagging classifier with base_estimator=LR', 'Random Forest with default parameters',
              'Tuned Random Forest Classifier', 'Random Forest with class_weights'],
    'Train_Accuracy': acc_train, 'Test_Accuracy': acc_test,
    'Train_Recall': recall_train, 'Test_Recall': recall_test,
    'Train_Precision': precision_train, 'Test_Precision': precision_test,
    'Train_f1_score': f1_score_train, 'Test_f1_score': f1_score_test})
comparison_frame
| Model | Train_Accuracy | Test_Accuracy | Train_Recall | Test_Recall | Train_Precision | Test_Precision | Train_f1_score | Test_f1_score | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | Decision Tree Model | 1.00 | 0.89 | 1.00 | 0.63 | 1.00 | 0.72 | 1.00 | 0.68 |
| 1 | Tuned Decision Tree | 0.76 | 0.72 | 0.81 | 0.68 | 0.42 | 0.37 | 0.56 | 0.48 |
| 2 | Bagging classifier with default parameters | 0.99 | 0.92 | 0.97 | 0.68 | 1.00 | 0.86 | 0.98 | 0.76 |
| 3 | Tuned Bagging Classifier | 1.00 | 0.91 | 0.99 | 0.61 | 1.00 | 0.88 | 1.00 | 0.72 |
| 4 | Bagging classifier with base_estimator=LR | 0.85 | 0.83 | 0.29 | 0.26 | 0.73 | 0.61 | 0.42 | 0.36 |
| 5 | Random Forest with deafult parameters | 1.00 | 0.90 | 1.00 | 0.55 | 1.00 | 0.88 | 1.00 | 0.68 |
| 6 | Tuned Random Forest Classifier | 0.92 | 0.86 | 0.60 | 0.40 | 0.95 | 0.76 | 0.74 | 0.52 |
| 7 | Random Forest with class_weights | 0.89 | 0.82 | 0.84 | 0.61 | 0.66 | 0.53 | 0.74 | 0.56 |
# NOTE(review): this re-splits the data with random_state=42, so the boosting
# models below are evaluated on a different test split than the models above —
# confirm the two comparison tables are meant to use different splits.
X_train, X_test, y_train, y_test = train_test_split(X,
y, test_size=0.30,stratify=y,random_state=42)
# AdaBoost with default hyperparameters
abc = AdaBoostClassifier(random_state=1)
abc.fit(X_train,y_train)
AdaBoostClassifier(random_state=1)
#Using above defined function to get accuracy, recall and precision on train and test set
abc_score=get_metrics_score(abc)
Accuracy on training set : 0.8570593393744519 Accuracy on test set : 0.8302658486707567 Recall on training set : 0.3804347826086957 Recall on test set : 0.29347826086956524 Precision on training set : 0.7313432835820896 Precision on test set : 0.6 F1 Score on training set : 0.5005107252298263 F1 Score on test set : 0.39416058394160586
make_confusion_matrix(abc,y_test)
# Gradient boosting with default hyperparameters
gbc = GradientBoostingClassifier(random_state=1)
gbc.fit(X_train,y_train)
GradientBoostingClassifier(random_state=1)
#Using above defined function to get accuracy, recall and precision on train and test set
gbc_score=get_metrics_score(gbc)
Accuracy on training set : 0.8897983045893014 Accuracy on test set : 0.8520790729379687 Recall on training set : 0.4968944099378882 Recall on test set : 0.358695652173913 Precision on training set : 0.8579088471849866 Precision on test set : 0.7122302158273381 F1 Score on training set : 0.6293018682399214 F1 Score on test set : 0.47710843373493966
make_confusion_matrix(gbc,y_test)
# XGBoost with default hyperparameters; use_label_encoder=False and an explicit
# eval_metric silence the library's deprecation warnings
xgb = XGBClassifier(eval_metric = "logloss",use_label_encoder=False,random_state=1)
xgb.fit(X_train,y_train)
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
colsample_bynode=1, colsample_bytree=1, eval_metric='logloss',
gamma=0, gpu_id=-1, importance_type='gain',
interaction_constraints='', learning_rate=0.300000012,
max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
monotone_constraints='()', n_estimators=100, n_jobs=8,
num_parallel_tree=1, random_state=1, reg_alpha=0, reg_lambda=1,
scale_pos_weight=1, subsample=1, tree_method='exact',
use_label_encoder=False, validate_parameters=1, verbosity=None)
#Using above defined function to get accuracy, recall and precision on train and test set
xgb_score=get_metrics_score(xgb)
Accuracy on training set : 0.9994153756211634 Accuracy on test set : 0.912747102931152 Recall on training set : 0.9968944099378882 Recall on test set : 0.6485507246376812 Precision on training set : 1.0 Precision on test set : 0.8523809523809524 F1 Score on training set : 0.9984447900466563 F1 Score on test set : 0.736625514403292
make_confusion_matrix(xgb,y_test)
# Choose the type of classifier.
abc_tuned = AdaBoostClassifier(random_state=1)
# Grid of parameters to choose from
## add from article
parameters = {
#Let's try different max_depth for base_estimator
"base_estimator":[DecisionTreeClassifier(max_depth=1),DecisionTreeClassifier(max_depth=2),DecisionTreeClassifier(max_depth=3)],
"n_estimators": np.arange(10,110,10),
"learning_rate":np.arange(0.1,1,0.1)
}
# Type of scoring used to compare parameter combinations (optimizing recall)
acc_scorer = metrics.make_scorer(metrics.recall_score)
# Run the grid search
grid_obj = GridSearchCV(abc_tuned, parameters, scoring=acc_scorer,cv=5)
grid_obj = grid_obj.fit(X_train, y_train)
# Set the clf to the best combination of parameters
abc_tuned = grid_obj.best_estimator_
# Fit the best algorithm to the data.
abc_tuned.fit(X_train, y_train)
AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=3),
learning_rate=0.9, n_estimators=100, random_state=1)
#Using above defined function to get accuracy, recall and precision on train and test set
abc_tuned_score=get_metrics_score(abc_tuned)
Accuracy on training set : 0.9885998246126864 Accuracy on test set : 0.8793456032719836 Recall on training set : 0.9503105590062112 Recall on test set : 0.5978260869565217 Precision on training set : 0.9886914378029079 Precision on test set : 0.7142857142857143 F1 Score on training set : 0.9691211401425179 F1 Score on test set : 0.6508875739644971
make_confusion_matrix(abc_tuned,y_test)
# importance of features in the tree building
print(pd.DataFrame(abc_tuned.feature_importances_, columns = ["Imp"], index = X_train.columns).sort_values(by = 'Imp', ascending = False))
Imp MonthlyIncome 0.275644 Age 0.136991 DurationOfPitch 0.122789 PitchSatisfactionScore 0.063600 NumberOfTrips 0.051955 Gender_Male 0.041313 NumberOfFollowups 0.040540 PreferredPropertyStar 0.033258 Designation_Executive 0.028656 Passport 0.023137 CityTier 0.020931 TypeofContact_Self Enquiry 0.020685 NumberOfChildrenVisited 0.017205 OwnCar 0.015766 Occupation_Small Business 0.014056 Occupation_Salaried 0.013280 MaritalStatus_Single 0.011436 Occupation_Large Business 0.011357 MaritalStatus_Married 0.010061 NumberOfPersonVisited 0.009913 ProductPitched_Deluxe 0.008957 ProductPitched_Super Deluxe 0.007810 MaritalStatus_Unmarried 0.006990 Designation_Manager 0.005521 ProductPitched_Standard 0.003821 Designation_Senior Manager 0.002678 Designation_VP 0.001648 ProductPitched_King 0.000000
# Tuned AdaBoost: feature-importance bar chart, ascending so the most
# important feature sits at the top of the figure.
importances = abc_tuned.feature_importances_
indices = importances.argsort()
feature_names = list(X.columns)
ys = range(len(indices))
plt.figure(figsize=(12, 12))
plt.title('Feature Importances')
plt.barh(ys, importances[indices], align='center', color='violet')
plt.yticks(ys, [feature_names[i] for i in indices])
plt.xlabel('Relative Importance')
plt.show()
MonthlyIncome is the most important feature as per the tuned AdaBoost model.
Let's try using AdaBoost classifier as the estimator for initial predictions
# Gradient boosting seeded with AdaBoost's predictions as the initial estimator
gbc_init = GradientBoostingClassifier(init=AdaBoostClassifier(random_state=1),random_state=1)
gbc_init.fit(X_train,y_train)
GradientBoostingClassifier(init=AdaBoostClassifier(random_state=1),
random_state=1)
#Using above defined function to get accuracy, recall and precision on train and test set
gbc_init_score=get_metrics_score(gbc_init)
Accuracy on training set : 0.8892136802104648 Accuracy on test set : 0.8513974096796183 Recall on training set : 0.4984472049689441 Recall on test set : 0.3695652173913043 Precision on training set : 0.8514588859416445 Precision on test set : 0.6986301369863014 F1 Score on training set : 0.6287952987267386 F1 Score on test set : 0.48341232227488157
# Choose the type of classifier.
gbc_tuned = GradientBoostingClassifier(init=AdaBoostClassifier(random_state=1),random_state=1)
# Grid of parameters to choose from
## add from article
parameters = {
"n_estimators": [100,150,200,250],
"subsample":[0.8,0.9,1],
"max_features":[0.7,0.8,0.9,1]
}
# Type of scoring used to compare parameter combinations (optimizing recall)
acc_scorer = metrics.make_scorer(metrics.recall_score)
# Run the grid search
grid_obj = GridSearchCV(gbc_tuned, parameters, scoring=acc_scorer,cv=5)
grid_obj = grid_obj.fit(X_train, y_train)
# Set the clf to the best combination of parameters
gbc_tuned = grid_obj.best_estimator_
# Fit the best algorithm to the data.
gbc_tuned.fit(X_train, y_train)
GradientBoostingClassifier(init=AdaBoostClassifier(random_state=1),
max_features=0.8, n_estimators=250, random_state=1,
subsample=0.9)
#Using above defined function to get accuracy, recall and precision on train and test set
gbc_tuned_score=get_metrics_score(gbc_tuned)
Accuracy on training set : 0.9277988892136803 Accuracy on test set : 0.8718473074301295 Recall on training set : 0.6630434782608695 Recall on test set : 0.4528985507246377 Precision on training set : 0.9343544857768052 Precision on test set : 0.7716049382716049 F1 Score on training set : 0.7756584922797456 F1 Score on test set : 0.5707762557077626
make_confusion_matrix(gbc_tuned,y_test)
# importance of features in the tree building ( The importance of a feature is computed as the
#(normalized) total reduction of the criterion brought by that feature. It is also known as the Gini importance )
print(pd.DataFrame(gbc_tuned.feature_importances_, columns = ["Imp"], index = X_train.columns).sort_values(by = 'Imp', ascending = False))
Imp MonthlyIncome 0.165651 Passport 0.136120 Age 0.132986 Designation_Executive 0.121614 DurationOfPitch 0.074846 NumberOfFollowups 0.054757 CityTier 0.049158 PreferredPropertyStar 0.045516 NumberOfTrips 0.037461 MaritalStatus_Single 0.032994 PitchSatisfactionScore 0.027305 MaritalStatus_Unmarried 0.020803 Occupation_Large Business 0.015526 Gender_Male 0.015357 TypeofContact_Self Enquiry 0.011544 MaritalStatus_Married 0.009890 ProductPitched_Standard 0.008916 Designation_Senior Manager 0.006001 NumberOfChildrenVisited 0.005383 Occupation_Small Business 0.005150 NumberOfPersonVisited 0.004862 Designation_Manager 0.004333 Occupation_Salaried 0.004238 ProductPitched_Deluxe 0.003857 OwnCar 0.002692 ProductPitched_Super Deluxe 0.002678 ProductPitched_King 0.000358 Designation_VP 0.000000
# Plot tuned gradient-boosting feature importances.
# BUG FIX: the original never reassigned `importances` here, so it reused the
# stale array from the AdaBoost plot above and charted the wrong model's
# importances. Read them from gbc_tuned explicitly.
importances = gbc_tuned.feature_importances_
indices = np.argsort(importances)
feature_names = list(X.columns)
plt.figure(figsize=(12,12))
plt.title('Feature Importances')
plt.barh(range(len(indices)), importances[indices], color='violet', align='center')
plt.yticks(range(len(indices)), [feature_names[i] for i in indices])
plt.xlabel('Relative Importance')
plt.show()
MonthlyIncome is the most important feature, followed by Age and DurationOfPitch, as per the tuned gradient boosting model
def timer(start_time=None):
    """Simple wall-clock timer for long-running cells.

    Call with no argument to get a start timestamp; call again with that
    timestamp to print (and not return) the elapsed time.
    """
    # Local import: the notebook's `from datetime import datetime` cell runs
    # much later, so don't depend on execution order.
    from datetime import datetime
    if not start_time:
        start_time = datetime.now()
        return start_time
    # start_time given: report elapsed wall-clock time since it
    thour, temp_sec = divmod((datetime.now() - start_time).total_seconds(), 3600)
    tmin, tsec = divmod(temp_sec, 60)
    # BUG FIX: the original printed the literal '/n' instead of a newline '\n'
    print('\n Time taken: %i hours %i minute and %s seconds.' % (thour, tmin, round(tsec, 2)))
# Choose the type of classifier.
xgb_tuned = XGBClassifier(eval_metric='logloss',use_label_encoder=False,random_state=1)
# Grid of parameters to choose from
## add from
# scale_pos_weight upweights the positive class inside XGBoost's loss
parameters = {
#"n_estimators": np.arange(10,100,20),
#"n_estimators": [10,30,50,70,90],
"n_estimators": [1,50,100],
"scale_pos_weight":[0,1,2,5],
#"subsample":[0.5,0.7,0.9,1],
"max_depth":[3,4,5,8,10,12,15],
"learning_rate":[0.05,0.1,0.2,0.5],
#"gamma":[0,1,3],
#"gamma":[0.0,0.1,0.2,0.3,0.4],
"colsample_bytree":[0.5,0.7,0.9,1]
#"colsample_bylevel":[0.5,0.7,0.9,1]
}
# Type of scoring used to compare parameter combinations (optimizing recall)
acc_scorer = metrics.make_scorer(metrics.recall_score)
# Run the grid search
grid_obj = GridSearchCV(xgb_tuned, parameters,scoring=acc_scorer,cv=5)
# time the (long) search with the timer helper defined above
from datetime import datetime
start_time=timer(None)
grid_obj = grid_obj.fit(X_train, y_train)
timer(start_time)
/n Time taken: 0 hours 12 minute and 37.41 seconds.
# Set the clf to the best combination of parameters
xgb_tuned = grid_obj.best_estimator_
# Fit the best algorithm to the data.
xgb_tuned.fit(X_train, y_train)
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
colsample_bynode=1, colsample_bytree=1, eval_metric='logloss',
gamma=0, gpu_id=-1, importance_type='gain',
interaction_constraints='', learning_rate=0.1, max_delta_step=0,
max_depth=5, min_child_weight=1, missing=nan,
monotone_constraints='()', n_estimators=100, n_jobs=8,
num_parallel_tree=1, random_state=1, reg_alpha=0, reg_lambda=1,
scale_pos_weight=5, subsample=1, tree_method='exact',
use_label_encoder=False, validate_parameters=1, verbosity=None)
#Using above defined function to get accuracy, recall and precision on train and test set
xgb_tuned_score=get_metrics_score(xgb_tuned)
Accuracy on training set : 0.9318912598655364 Accuracy on test set : 0.8438991138377642 Recall on training set : 0.9580745341614907 Recall on test set : 0.7355072463768116 Precision on training set : 0.7496962332928311 Precision on test set : 0.5654596100278552 F1 Score on training set : 0.8411724608043626 F1 Score on test set : 0.6393700787401575
make_confusion_matrix(xgb_tuned,y_test)
# importance of features in the tree building ( The importance of a feature is computed as the
#(normalized) total reduction of the criterion brought by that feature. It is also known as the Gini importance )
print(pd.DataFrame(xgb_tuned.feature_importances_, columns = ["Imp"], index = X_train.columns).sort_values(by = 'Imp', ascending = False))
Imp Passport 0.165111 Designation_Executive 0.126801 CityTier 0.052116 MaritalStatus_Married 0.045593 PreferredPropertyStar 0.045417 Occupation_Large Business 0.042107 ProductPitched_Deluxe 0.041128 MaritalStatus_Unmarried 0.037262 MaritalStatus_Single 0.035901 ProductPitched_Standard 0.035176 Age 0.033748 NumberOfFollowups 0.033352 ProductPitched_Super Deluxe 0.032282 NumberOfTrips 0.028366 PitchSatisfactionScore 0.028122 DurationOfPitch 0.027224 Gender_Male 0.026798 MonthlyIncome 0.025614 ProductPitched_King 0.025329 TypeofContact_Self Enquiry 0.024982 Occupation_Salaried 0.020559 Occupation_Small Business 0.019453 OwnCar 0.019064 NumberOfPersonVisited 0.015439 NumberOfChildrenVisited 0.013058 Designation_Manager 0.000000 Designation_Senior Manager 0.000000 Designation_VP 0.000000
# Tuned XGBoost: feature-importance bar chart, ascending so the most
# important feature is plotted last (top of the chart).
importances = xgb_tuned.feature_importances_
indices = importances.argsort()
feature_names = list(X.columns)
ys = range(len(indices))
plt.figure(figsize=(12, 12))
plt.title('Feature Importances')
plt.barh(ys, importances[indices], align='center', color='violet')
plt.yticks(ys, [feature_names[i] for i in indices])
plt.xlabel('Relative Importance')
plt.show()
Passport is the most important feature as per XGBoost model unlike AdaBoost and Gradient Boosting, where the most important feature is the MonthlyIncome.
# defining the full list of models (trees, bagging, forests, boosting) to compare
models = [DTModel, dtree_tuned, bagging_estimator, bagging_estimator_tuned, bagging_lr,
          rf_estimator, rf_estimator_tuned, rf_estimator_weighted,
          abc, abc_tuned, gbc, gbc_init, gbc_tuned, xgb, xgb_tuned]
#models = [abc, abc_tuned, gbc, gbc_init, gbc_tuned, xgb, xgb_tuned,stacking_estimator]
# get_metrics_score returns
# [train_acc, test_acc, train_recall, test_recall,
#  train_precision, test_precision, train_f1, test_f1];
# score every model once, round to 2 decimals, then split the columns out by
# position (replaces eight hand-maintained append loops; also fixes the
# 'precall' comment typo and the 'deafult'/'paramters' label typos).
scores = [np.round(get_metrics_score(model, False), 2) for model in models]
(acc_train, acc_test, recall_train, recall_test,
 precision_train, precision_test,
 f1_score_train, f1_score_test) = (list(col) for col in zip(*scores))
comparison_frame = pd.DataFrame({
    'Model': ['Decision Tree Model', 'Tuned Decision Tree',
              'Bagging classifier with default parameters', 'Tuned Bagging Classifier',
              'Bagging classifier with base_estimator=LR', 'Random Forest with default parameters',
              'Tuned Random Forest Classifier', 'Random Forest with class_weights',
              'AdaBoost with default parameters', 'AdaBoost Tuned',
              'Gradient Boosting with default parameters', 'Gradient Boosting with init=AdaBoost',
              'Gradient Boosting Tuned', 'XGBoost with default parameters', 'XGBoost Tuned'],
    'Train_Accuracy': acc_train, 'Test_Accuracy': acc_test,
    'Train_Recall': recall_train, 'Test_Recall': recall_test,
    'Train_Precision': precision_train, 'Test_Precision': precision_test,
    'Train_F1_Score': f1_score_train, 'Test_F1_Score': f1_score_test})
comparison_frame
| Model | Train_Accuracy | Test_Accuracy | Train_Recall | Test_Recall | Train_Precision | Test_Precision | Train_F1_Score | Test_F1_Score | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | Decision Tree Model | 1.00 | 0.89 | 1.00 | 0.63 | 1.00 | 0.72 | 1.00 | 0.68 |
| 1 | Tuned Decision Tree | 0.76 | 0.72 | 0.81 | 0.68 | 0.42 | 0.37 | 0.56 | 0.48 |
| 2 | Bagging classifier with default parameters | 0.99 | 0.92 | 0.97 | 0.68 | 1.00 | 0.86 | 0.98 | 0.76 |
| 3 | Tuned Bagging Classifier | 1.00 | 0.91 | 0.99 | 0.61 | 1.00 | 0.88 | 1.00 | 0.72 |
| 4 | Bagging classifier with base_estimator=LR | 0.85 | 0.83 | 0.29 | 0.26 | 0.73 | 0.61 | 0.42 | 0.36 |
| 5 | Random Forest with deafult parameters | 1.00 | 0.90 | 1.00 | 0.55 | 1.00 | 0.88 | 1.00 | 0.68 |
| 6 | Tuned Random Forest Classifier | 0.92 | 0.86 | 0.60 | 0.40 | 0.95 | 0.76 | 0.74 | 0.52 |
| 7 | Random Forest with class_weights | 0.89 | 0.82 | 0.84 | 0.61 | 0.66 | 0.53 | 0.74 | 0.56 |
| 8 | AdaBoost with default paramters | 0.86 | 0.83 | 0.38 | 0.29 | 0.73 | 0.60 | 0.50 | 0.39 |
| 9 | AdaBoost Tuned | 0.99 | 0.88 | 0.95 | 0.60 | 0.99 | 0.71 | 0.97 | 0.65 |
| 10 | Gradient Boosting with default parameters | 0.89 | 0.85 | 0.50 | 0.36 | 0.86 | 0.71 | 0.63 | 0.48 |
| 11 | Gradient Boosting with init=AdaBoost | 0.89 | 0.85 | 0.50 | 0.37 | 0.85 | 0.70 | 0.63 | 0.48 |
| 12 | Gradient Boosting Tuned | 0.93 | 0.87 | 0.66 | 0.45 | 0.93 | 0.77 | 0.78 | 0.57 |
| 13 | XGBoost with default parameters | 1.00 | 0.91 | 1.00 | 0.65 | 1.00 | 0.85 | 1.00 | 0.74 |
| 14 | XGBoost Tuned | 0.93 | 0.84 | 0.96 | 0.74 | 0.75 | 0.57 | 0.84 | 0.64 |
[1] https://josephbalog.com/wp-content/uploads/2021/07/JBalog_ET_Project_GTTravel_Package_Purchase_Prediction.pdf [2] https://www.kaggle.com/jordanrich/predictive-sales-operations-pre-launch-modelling [3] https://github.com/SrujanaBandla/Ensemble-Techniques-Travel-Package-Purchase-Prediction